Load libraries

library(tidyverse)
library(piano)
library(edgeR)

Set wd

Load lm results from transcripts_lm

lm_res = readRDS("../github/results/linear_models/transcripts/lm_res_symbols.rds")

head(lm_res)

Load gene sets

kegg_gsc = loadGSC(file="gene_sets/c2.cp.kegg.v7.2.symbols.gmt")
hallmarks_gsc = loadGSC(file="gene_sets/h.all.v7.2.symbols.gmt")

Comp. YesvsNo

FDR correction

lm_res %>% 
  filter(term == "spliceosome_mutatedYES") %>% 
  mutate(fdr=p.adjust(p.value, method = "fdr")) %>% 
  arrange(p.value) %>% 
  head()

Extract FDR adjusted p-values and logFC (estimate)

cat("p-values \n")
p-values 
cat("p-values \n")
p-values 
head(p_adj)
     PSMD2      SRRM2       COPA     NDUFS7       TAB2     PABPC1 
0.01134224 0.01134224 0.01134224 0.01134224 0.01134224 0.01417218 
cat("\nlog2-FC \n")

log2-FC 
head(log2fc)
    PSMD2     SRRM2      COPA    NDUFS7      TAB2    PABPC1 
1.6451148 1.0252466 0.8912047 0.8341501 0.6651474 1.3064701 

Run PIANO For kegg gs.

gsa_res
Final gene/gene-set association: 3082 genes and 50 gene-sets
  Details:
  Calculating gene set statistics from 3082 out of 36228 gene-level statistics
  Removed 1301 genes from GSC due to lack of matching gene statistics
  Removed 0 gene sets containing no genes after gene removal
  Removed additionally 0 gene sets not matching the size limits
  Loaded additional information for 50 gene sets

Gene statistic type: p-like
Method: reporter
Gene-set statistic name: Z 
Significance: Theoretical null distribution
Adjustment: fdr
Gene set size limit: (1,Inf)
Permutations: 10000 
Total run time: 1.5 min
GSAheatmap(gsaRes = gsa_res)

Run PIANO on KEGG gs, using non-adj p-values

gsa_adj_kegg <- runGSA(p_val, 
                  log2fc,
                  gsc = kegg_gsc,
                  ncpus=8, 
                  geneSetStat = "reporter",
                  signifMethod = "nullDist",
                  nPerm = 10000)
Running gene set analysis:
Checking arguments...
Found duplicates in rownames(geneLevelStats), all values will be used for calculation of gene set statistics. It is recommended to avoid this and handle duplicates prior to running runGSA. In particular, the GSEA and FGSEA implementations will give different results!
done!
Final gene/gene-set association: 2984 genes and 186 gene sets
  Details:
  Calculating gene set statistics from 2984 out of 36228 gene-level statistics
  Removed 2261 genes from GSC due to lack of matching gene statistics
  Removed 0 gene sets containing no genes after gene removal
  Removed additionally 0 gene sets not matching the size limits
  Loaded additional information for 186 gene sets
Calculating gene set statistics...done!
Calculating gene set significance...done!
Adjusting for multiple testing...done!

Run PIANO on KEGG gs, using gene sampling to assess significance

gsa_kegg_geneS <- runGSA(p_val, 
                  log2fc,
                  gsc = kegg_gsc,
                  ncpus=8, 
                  geneSetStat = "reporter",
                  signifMethod = "geneSampling",
                  nPerm = 10000)
Running gene set analysis:
Checking arguments...
Found duplicates in rownames(geneLevelStats), all values will be used for calculation of gene set statistics. It is recommended to avoid this and handle duplicates prior to running runGSA. In particular, the GSEA and FGSEA implementations will give different results!
done!
Final gene/gene-set association: 2984 genes and 186 gene sets
  Details:
  Calculating gene set statistics from 2984 out of 36228 gene-level statistics
  Using all 36228 gene-level statistics for significance estimation
  Removed 2261 genes from GSC due to lack of matching gene statistics
  Removed 0 gene sets containing no genes after gene removal
  Removed additionally 0 gene sets not matching the size limits
  Loaded additional information for 186 gene sets
Calculating gene set statistics...done!
Calculating gene set significance...done!
Adjusting for multiple testing...done!

Run PIANO on hallmarks gsc, theoretical null distribution.

gsa_res <- runGSA(p_adj, 
                  log2fc,
                  gsc = hallmarks_gsc,
                  ncpus=8, 
                  geneSetStat = "reporter",
                  signifMethod = "nullDist",
                  nPerm = 10000)
Running gene set analysis:
Checking arguments...
Found duplicates in rownames(geneLevelStats), all values will be used for calculation of gene set statistics. It is recommended to avoid this and handle duplicates prior to running runGSA. In particular, the GSEA and FGSEA implementations will give different results!
done!
Final gene/gene-set association: 3082 genes and 50 gene sets
  Details:
  Calculating gene set statistics from 3082 out of 36228 gene-level statistics
  Removed 1301 genes from GSC due to lack of matching gene statistics
  Removed 0 gene sets containing no genes after gene removal
  Removed additionally 0 gene sets not matching the size limits
  Loaded additional information for 50 gene sets
Calculating gene set statistics...done!
Calculating gene set significance...done!
Adjusting for multiple testing...done!
GSAheatmap(gsaRes = gsa_res)

Run PIANO on hallmarks gsc, gene sampling.

gsa_res <- runGSA(p_adj, 
                  log2fc,
                  gsc = hallmarks_gsc,
                  ncpus=8, 
                  geneSetStat = "reporter",
                  signifMethod = "geneSampling",
                  nPerm = 10000)
Running gene set analysis:
Checking arguments...
Found duplicates in rownames(geneLevelStats), all values will be used for calculation of gene set statistics. It is recommended to avoid this and handle duplicates prior to running runGSA. In particular, the GSEA and FGSEA implementations will give different results!
done!
Final gene/gene-set association: 3082 genes and 50 gene sets
  Details:
  Calculating gene set statistics from 3082 out of 36228 gene-level statistics
  Using all 36228 gene-level statistics for significance estimation
  Removed 1301 genes from GSC due to lack of matching gene statistics
  Removed 0 gene sets containing no genes after gene removal
  Removed additionally 0 gene sets not matching the size limits
  Loaded additional information for 50 gene sets
Calculating gene set statistics...done!
Calculating gene set significance...done!
Adjusting for multiple testing...done!
GSAheatmap(gsaRes = gsa_res)

Extract matrix

Produce tidy heatmap

LS0tCnRpdGxlOiAiUElBTk8gYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KIyBMb2FkIGxpYnJhcmllcwpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocGlhbm8pCmxpYnJhcnkoZWRnZVIpCmBgYAoKU2V0IHdkCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFLCBlY2hvPUZBTFNFfQpyZXF1aXJlKCJrbml0ciIpCm9wdHNfa25pdCRzZXQocm9vdC5kaXIgPSAiL1VzZXJzL2Nhc3RpbGxuL0Rlc2t0b3AvdGhlc2lzL2xvY2FsZGF0YSIpCmBgYAoKCkxvYWQgbG0gcmVzdWx0cyBmcm9tIHRyYW5zY3JpcHRzX2xtCmBgYHtyfQpsbV9yZXMgPSByZWFkUkRTKCIuLi9naXRodWIvcmVzdWx0cy9saW5lYXJfbW9kZWxzL3RyYW5zY3JpcHRzL2xtX3Jlc19zeW1ib2xzLnJkcyIpCgpoZWFkKGxtX3JlcykKYGBgCgpMb2FkIGdlbmUgc2V0cyAKYGBge3J9CmtlZ2dfZ3NjID0gbG9hZEdTQyhmaWxlPSJnZW5lX3NldHMvYzIuY3Aua2VnZy52Ny4yLnN5bWJvbHMuZ210IikKaGFsbG1hcmtzX2dzYyA9IGxvYWRHU0MoZmlsZT0iZ2VuZV9zZXRzL2guYWxsLnY3LjIuc3ltYm9scy5nbXQiKQpgYGAKCgojIENvbXAuIFllc3ZzTm8KCkZEUiBjb3JyZWN0aW9uCmBgYHtyfQpsbV9yZXMgJT4lIAogIGZpbHRlcih0ZXJtID09ICJzcGxpY2Vvc29tZV9tdXRhdGVkWUVTIikgJT4lIAogIG11dGF0ZShmZHI9cC5hZGp1c3QocC52YWx1ZSwgbWV0aG9kID0gImZkciIpKSAlPiUgCiAgYXJyYW5nZShwLnZhbHVlKSAlPiUgCiAgaGVhZCgpCmBgYAoKYGBge3J9CmxtX2FkaiA9IAogIGxtX3JlcyAlPiUgCiAgZmlsdGVyKHRlcm0gPT0gInNwbGljZW9zb21lX211dGF0ZWRZRVMiKSAlPiUgCiAgbXV0YXRlKGZkcj1wLmFkanVzdChwLnZhbHVlLCBtZXRob2QgPSAiZmRyIikpICU+JSAKICBhcnJhbmdlKHAudmFsdWUpIAoKaGVhZChsbV9hZGopCmBgYAoKRXh0cmFjdCBGRFIgYWRqdXN0ZWQgcC12YWx1ZXMgYW5kIGxvZ0ZDIChlc3RpbWF0ZSkKYGBge3J9CnBfdmFsID0gCiAgbG1fYWRqICU+JSAKICBwdWxsKHAudmFsdWUpCgpwX2FkaiA9IAogIGxtX2FkaiAlPiUgCiAgcHVsbChmZHIpCgpsb2cyZmMgPSAKICBsbV9hZGogJT4lIAogIHB1bGwoZXN0aW1hdGUpCgoKbmFtZXMocF92YWwpID0gCiAgbG1fYWRqICU+JSAKICBwdWxsKEh1Z29fU3ltYm9sKQoKbmFtZXMocF9hZGopID0gCiAgbG1fYWRqICU+JSAKICBwdWxsKEh1Z29fU3ltYm9sKQoKbmFtZXMobG9nMmZjKSA9CiAgbG1fYWRqICU+JSAKICBwdWxsKEh1Z29fU3ltYm9sKQpgYGAKCmBgYHtyfQpjYXQoInAtdmFsdWVzIFxuIikKaGVhZChwX2FkaikKYGBgCgpgYGB7cn0KY2F0KCJcbmxvZzItRkMgXG4iKQpoZWFkKGxvZzJmYykKYGBgCgpSdW4gUElBTk8KRm9yIGtlZ2cgZ3MuCmBgYHtyfQpsaWJyYXJ5KHNub3dmYWxsKQpnc2FfYWRqX2tlZ2cgPC0gcnVuR1NBKHBfYWRqLCAKICAgICAgICAgICAgICAgICAgbG9nMmZjLAogICAgICAgICAgICAgICAgICBnc2MgPSBrZWdnX2dzYywKICAgICAgICAgICAgICAgICAgbmNwdXM9OCwgCiAgICAgICAgICAgICAgICAgIGdlbmVTZXRTdGF0ID0gInJlcG9ydGVyIiwKICAgICAgICAgICAgICAgICAgc2lnbmlmTWV0aG9kID0gIm51bGxEaXN0IiwKICAgICAgICAgICAgICAgICAgblBlcm0gPSAxMDAwMCkKCmdzYV9yZXMKYGBgCgpgYGB7ciwgZmlnLndpZHRoPTgsIGZpZy5oZWlnaHQ9OH0KR1NBaGVhdG1hcChnc2FSZXMgPSBnc2FfcmVzKQpgYGAKClJ1biBQSUFOTyBvbiBLRUdHIGdzLCB1c2luZyBub24tYWRqIHAtdmFsdWVzIApgYGB7cn0KZ3NhX2tlZ2cgPC0gcnVuR1NBKHBfdmFsLCAKICAgICAgICAgICAgICAgICAgbG9nMmZjLAogICAgICAgICAgICAgICAgICBnc2MgPSBrZWdnX2dzYywKICAgICAgICAgICAgICAgICAgbmNwdXM9OCwgCiAgICAgICAgICAgICAgICAgIGdlbmVTZXRTdGF0ID0gInJlcG9ydGVyIiwKICAgICAgICAgICAgICAgICAgc2lnbmlmTWV0aG9kID0gIm51bGxEaXN0IiwKICAgICAgICAgICAgICAgICAgblBlcm0gPSAxMDAwMCkKCmBgYAoKYGBge3IsIGZpZy53aWR0aD04LCBmaWcuaGVpZ2h0PTh9CkdTQWhlYXRtYXAoZ3NhUmVzID0gZ3NhX2Fkal9rZWdnKQpgYGAKUnVuIFBJQU5PIG9uIEtFR0cgZ3MsIHVzaW5nIGdlbmUgc2FtcGxpbmcgdG8gYXNzZXNzIHNpZ25pZmljYW5jZSAKYGBge3J9CmdzYV9rZWdnX2dlbmVTIDwtIHJ1bkdTQShwX3ZhbCwgCiAgICAgICAgICAgICAgICAgIGxvZzJmYywKICAgICAgICAgICAgICAgICAgZ3NjID0ga2VnZ19nc2MsCiAgICAgICAgICAgICAgICAgIG5jcHVzPTgsIAogICAgICAgICAgICAgICAgICBnZW5lU2V0U3RhdCA9ICJyZXBvcnRlciIsCiAgICAgICAgICAgICAgICAgIHNpZ25pZk1ldGhvZCA9ICJnZW5lU2FtcGxpbmciLAogICAgICAgICAgICAgICAgICBuUGVybSA9IDEwMDAwKQoKYGBgCgpgYGB7ciwgZmlnLndpZHRoPTgsIGZpZy5oZWlnaHQ9OH0KR1NBaGVhdG1hcChnc2FSZXMgPSBnc2Ffa2VnZ19nZW5lUykKYGBgCgpSdW4gUElBTk8gb24gaGFsbG1hcmtzIGdzYywgdGhlb3JldGljYWwgbnVsbCBkaXN0cmlidXRpb24uCmBgYHtyfQpnc2FfcmVzIDwtIHJ1bkdTQShwX2FkaiwgCiAgICAgICAgICAgICAgICAgIGxvZzJmYywKICAgICAgICAgICAgICAgICAgZ3NjID0gaGFsbG1hcmtzX2dzYywKICAgICAgICAgICAgICAgICAgbmNwdXM9OCwgCiAgICAgICAgICAgICAgICAgIGdlbmVTZXRTdGF0ID0gInJlcG9ydGVyIiwKICAgICAgICAgICAgICAgICAgc2lnbmlmTWV0aG9kID0gIm51bGxEaXN0IiwKICAgICAgICAgICAgICAgICAgblBlcm0gPSAxMDAwMCkKYGBgCgpgYGB7ciwgZmlnLndpZHRoPTgsIGZpZy5oZWlnaHQ9OH0KR1NBaGVhdG1hcChnc2FSZXMgPSBnc2FfcmVzKQpgYGAKUnVuIFBJQU5PIG9uIGhhbGxtYXJrcyBnc2MsIGdlbmUgc2FtcGxpbmcuCmBgYHtyfQpnc2FfcmVzIDwtIHJ1bkdTQShwX2FkaiwgCiAgICAgICAgICAgICAgICAgIGxvZzJmYywKICAgICAgICAgICAgICAgICAgZ3NjID0gaGFsbG1hcmtzX2dzYywKICAgICAgICAgICAgICAgICAgbmNwdXM9OCwgCiAgICAgICAgICAgICAgICAgIGdlbmVTZXRTdGF0ID0gInJlcG9ydGVyIiwKICAgICAgICAgICAgICAgICAgc2lnbmlmTWV0aG9kID0gImdlbmVTYW1wbGluZyIsCiAgICAgICAgICAgICAgICAgIG5QZXJtID0gMTAwMDApCmBgYAoKYGBge3IsIGZpZy53aWR0aD04LCBmaWcuaGVpZ2h0PTh9CkdTQWhlYXRtYXAoZ3NhUmVzID0gZ3NhX3JlcykKYGBgCgpFeHRyYWN0IG1hdHJpeApgYGB7cn0KCmBgYAoKUHJvZHVjZSB0aWR5IGhlYXRtYXAKYGBge3J9CgpgYGAKCg==